import duckdb
import pandas as pd
import sqlalchemy # No need to import duckdb_engine,
# SQLAlchemy will auto-detect
%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = FalseUntitled
Set-up ipython-sql to be able to write sql queries directly
To do it with DuckDB, basically following this guide, just need to make sure ipython-sql, SQLAlchemy and duckdb-engine are installed, besides the core libraries (notebook, pandas, duckdb). If any of them are missing, simply pip install them.
Step 1 is then to import ipython-sql Jupyter extension. It enables SQL cells in Jupyter. It supports inline SQL using %sql and a whole SQL cell starting it with %%sql.
I prefer Quarto to edit my notebooks, and the above still works. However, it seems Quarto’s SQL engine is still only for R since it requires knitr and does not seem to support the combo ipython-sql-SQLAlchemy. So you cannot simply use an SQL chunk like this
```{sql}
SELECT * FROM test;
```
But you have to use a standard Python chunk and use %sql or %%sql to be able to write SQL directly.
Step 2 is to fire-up DuckDB, either in memory or pointing to a file.
%sql duckdb:///:memory:
# %sql duckdb:///path/to/file.db
Test it’s working
%sql SELECT 'Off and flying!' as a_duckdb_column
| a_duckdb_column | |
|---|---|
| 0 | Off and flying! |
Load SQLite file
%%sql
-- SQLite Scanner
-- https://duckdb.org/docs/extensions/sqlite_scanner.html
-- TODO: perhaps consider SET GLOBAL sqlite_all_varchar=true;
-- to avoid things getting read as blob
INSTALL sqlite;
LOAD sqlite;
-- Attach the exported SQLite file: its tables become queryable
-- from this DuckDB session (listed by the PRAGMA below).
CALL sqlite_attach('data/2022-12-26-ecas-export.db');
PRAGMA show_tables;
| name | |
|---|---|
| 0 | android_metadata |
| 1 | meter |
| 2 | reading |
%%sql
SELECT * FROM meter;| _id | name | units | comment | vsf | tsf | cost | fcost | invert | vmsetup | type | currency | scaling | phyunits | bidir | prod | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | b'Strom' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 1 | 2 | b'W\xc3\xa4rmestrom 18.2' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 2 | 3 | b'W\xc3\xa4rmestrom 18.1' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 3 | 4 | b'Wasser' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 4 | 5 | b'L\xc3\xbcftungsanlage' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 5 | 6 | b'Waschmaschine' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 6 | 7 | b'Geschirrsp\xc3\xbcler' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 7 | 8 | b'Offen' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 8 | 9 | b'Reset' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 9 | 10 | b'Nachladen' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 10 | 11 | b'Temp' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
| 11 | 12 | b'W\xc3\xa4rmer' | b'' | b'' | b'50' | b'50' | b'' | b'' | b'0' | None | b'0' | b'$' | b'1.0' | b'' | b'0' | b'0' |
And filter only strom meter
%%sql
-- Readings of the electricity meter only: meterid = 1 is 'Strom'
-- in the meter table shown above.
CREATE OR REPLACE VIEW strom_sqlite AS
SELECT
meterid,
-- Blob Functions, because most columns get read as blob
-- https://duckdb.org/docs/sql/functions/blob
decode(date)::DATETIME AS date,
decode(value)::INT AS value
FROM reading WHERE meterid = 1
;
SELECT * FROM strom_sqlite;
| meterid | date | value | |
|---|---|---|---|
| 0 | 1 | 2020-11-30 17:35:00 | 12165 |
| 1 | 1 | 2020-11-30 23:59:00 | 12168 |
| 2 | 1 | 2020-12-01 14:23:00 | 12173 |
| 3 | 1 | 2020-12-01 07:18:00 | 12170 |
| 4 | 1 | 2020-12-01 20:21:00 | 12177 |
| ... | ... | ... | ... |
| 360 | 1 | 2022-12-25 19:19:00 | 18732 |
| 361 | 1 | 2022-12-25 21:43:00 | 18733 |
| 362 | 1 | 2022-12-26 08:20:00 | 18734 |
| 363 | 1 | 2022-12-26 09:55:00 | 18735 |
| 364 | 1 | 2022-12-26 12:31:00 | 18738 |
365 rows × 3 columns
And put the consumption data in there
%%sql
CREATE OR REPLACE VIEW strom AS
SELECT
*,
-- Minutes elapsed since the previous reading (NULL on the first row).
date_sub('minute', lag(date) over(order by date), date) AS minutes,
-- Meter increment since the previous reading.
value - lag(value) over(order by date) AS consumption,
-- Scale each increment to a 24h-equivalent rate; DuckDB allows
-- referencing the column aliases defined just above (lateral aliases).
24.0 * 60.0 * consumption / minutes AS consumption_day_equivalent
FROM strom_sqlite
ORDER BY date
;
SELECT * FROM strom;
| meterid | date | value | minutes | consumption | consumption_day_equivalent | |
|---|---|---|---|---|---|---|
| 0 | 1 | 2020-11-30 07:07:00 | 12162 | NaN | NaN | NaN |
| 1 | 1 | 2020-11-30 17:35:00 | 12165 | 628.0 | 3.0 | 6.878981 |
| 2 | 1 | 2020-11-30 23:59:00 | 12168 | 384.0 | 3.0 | 11.250000 |
| 3 | 1 | 2020-12-01 07:18:00 | 12170 | 439.0 | 2.0 | 6.560364 |
| 4 | 1 | 2020-12-01 14:23:00 | 12173 | 425.0 | 3.0 | 10.164706 |
| ... | ... | ... | ... | ... | ... | ... |
| 360 | 1 | 2022-12-25 19:19:00 | 18732 | 96.0 | 1.0 | 15.000000 |
| 361 | 1 | 2022-12-25 21:43:00 | 18733 | 144.0 | 1.0 | 10.000000 |
| 362 | 1 | 2022-12-26 08:20:00 | 18734 | 637.0 | 1.0 | 2.260597 |
| 363 | 1 | 2022-12-26 09:55:00 | 18735 | 95.0 | 1.0 | 15.157895 |
| 364 | 1 | 2022-12-26 12:31:00 | 18738 | 156.0 | 3.0 | 27.692308 |
365 rows × 6 columns
Visualize the data
%sql strom << SELECT * FROM strom;
Returning data to local variable strom
Of course noisy data, with substantial variation in the consumption day equivalent and there is 1.5 years without data.
import plotly.express as px

# Time series of the day-equivalent consumption.
fig = px.line(strom, x="date", y="consumption_day_equivalent")
fig.show()
import pandas as pd
from pandas_profiling import ProfileReport
#EDA using pandas-profiling
profile = ProfileReport(strom, explorative=True)
profile.to_file("output.html")
With the exception of the long period without data, the number of minutes shows there are measurements from a few hours away, to a few days away. Most measurements are between 7 and 16 hours apart. That’s worrisome, as the periods are relatively long. In addition, the measurements are scattered and do not follow a systematic pattern.
import plotly.express as px

# Drop the ~1.5-year gap (readings more than 10000 minutes apart).
no_gap = strom.query("minutes < 10000")
fig = px.histogram(no_gap, x="minutes", marginal="box")
fig.show()
The consumption day equivalent also varies substantially. Median 8.8, which is consistent with the long-run consumption (equivalent to 8.8 × 365 ≈ 3200 per year). The distribution has a long right tail, with very high consumptions, presumably associated with very short measurement periods.
import plotly.express as px

# Same gap filter, now looking at the consumption distribution.
no_gap = strom.query("minutes < 10000")
fig = px.histogram(no_gap, x="consumption_day_equivalent", marginal="box")
fig.show()Well, yeah, as expected, short measurement periods (few minutes) are associated with higher variability, and with the highest and lowest consumptions.
from matplotlib import pyplot

# Compute the filtered frame once instead of running the same
# .query() twice for the two axes.
clean = strom.query("minutes < 10000")
pyplot.scatter(clean["minutes"], clean["consumption_day_equivalent"])

import plotly.express as px

# Interactive version of the scatter above, with marginal
# distributions and the reading date on hover.
fig = px.scatter(
    strom.query("minutes < 10000"),
    x="minutes",
    y="consumption_day_equivalent",
    hover_data=["date"],
    marginal_x="histogram",
    marginal_y="histogram",
)
fig.show()
Let’s try to see what hours have the highest consumption. That’s tricky given this messy data. One approach is to just interpolate between data points and assume a constant consumption. That’s of course not realistic (especially during the day), but it would get us closer.
%%sql
SELECT MIN(date), MAX(DATE) FROM strom_sqlite;
| min(date) | max("DATE") | |
|---|---|---|
| 0 | 2020-11-30 07:07:00 | 2022-12-26 12:31:00 |
This is pretty inefficient, as it will create a table with as many rows as minutes there are. So more than a million, and then left join the actual data to that huge table. We end up with a table with a bunch of nulls, and only observations where there are actual measurements.
%%sql
-- One row per minute between the first and last measurement
-- (the MIN/MAX dates found above). Actual readings attach only at
-- their own minute via the LEFT JOIN; every other row stays NULL.
CREATE OR REPLACE TABLE strom_minute AS
SELECT
minute,
date,
value,
minutes,
consumption,
-- Average consumption per minute over the period ending at this reading.
1.0 * consumption / minutes AS consumption_per_minute
FROM generate_series(
TIMESTAMP '2020-11-30 07:07:00',
TIMESTAMP '2022-12-26 12:31:00',
INTERVAL 1 MINUTE
) tbl(minute)
LEFT JOIN strom
ON minute = strom.date
;
SELECT * FROM strom_minute;
| minute | date | value | minutes | consumption | consumption_per_minute | |
|---|---|---|---|---|---|---|
| 0 | 2020-11-30 17:35:00 | 2020-11-30 17:35:00 | 12165.0 | 628.0 | 3.0 | 0.004777 |
| 1 | 2020-11-30 23:59:00 | 2020-11-30 23:59:00 | 12168.0 | 384.0 | 3.0 | 0.007812 |
| 2 | 2020-12-01 07:18:00 | 2020-12-01 07:18:00 | 12170.0 | 439.0 | 2.0 | 0.004556 |
| 3 | 2020-12-01 14:23:00 | 2020-12-01 14:23:00 | 12173.0 | 425.0 | 3.0 | 0.007059 |
| 4 | 2020-12-01 20:21:00 | 2020-12-01 20:21:00 | 12177.0 | 358.0 | 4.0 | 0.011173 |
| ... | ... | ... | ... | ... | ... | ... |
| 1088960 | 2022-12-26 12:26:00 | NaT | NaN | NaN | NaN | NaN |
| 1088961 | 2022-12-26 12:27:00 | NaT | NaN | NaN | NaN | NaN |
| 1088962 | 2022-12-26 12:28:00 | NaT | NaN | NaN | NaN | NaN |
| 1088963 | 2022-12-26 12:29:00 | NaT | NaN | NaN | NaN | NaN |
| 1088964 | 2022-12-26 12:30:00 | NaT | NaN | NaN | NaN | NaN |
1088965 rows × 6 columns
And now we just interpolate the consumption per minute, filling the nulls with the next non-null value (i.e. the consumption is constant in all the measurement period -all the minutes between one measurement and the other-). TODO: this uses a correlated subquery. Look for a better solution https://dba.stackexchange.com/questions/279039/how-to-get-the-last-non-null-value-that-came-before-the-current-row
%%sql
CREATE OR REPLACE VIEW consumption_minute AS
SELECT
*,
-- Backward-fill the per-minute consumption: every minute takes the value
-- of the NEXT non-null reading, i.e. consumption is assumed constant over
-- the whole period between two measurements.
-- first_value(... IGNORE NULLS) over a CURRENT ROW .. UNBOUNDED FOLLOWING
-- frame replaces the O(n^2) correlated subquery flagged in the TODO above
-- with a single window scan.
first_value(consumption_per_minute IGNORE NULLS) OVER (
ORDER BY minute
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
) AS cm
FROM strom_minute
ORDER BY minute
;
%%sql
toy << SELECT * FROM consumption_minute ORDER BY minute LIMIT 5000;Returning data to local variable toy
Now we can simply aggregate per day and hour, and the average will be correct, as all the rows have comparable units (consumption for one minute, with equal weight).
%%sql
consumption_hour_avg << SELECT
-- hour of day, 0-23
hour(minute) AS hour,
-- average per-minute consumption, scaled to a yearly-equivalent figure
-- (x60 minutes, x24 hours, x365 days)
AVG(cm)*60*24*365 AS consumption
FROM consumption_minute
GROUP BY hour(minute)
;Returning data to local variable consumption_hour_avg
import plotly.express as px

# Yearly-equivalent consumption per hour of the day.
fig = px.bar(consumption_hour_avg, x="hour", y="consumption")
fig.show()
Ok, good enough. But this includes a very long period without measurements, which would have the effect of smoothing everything. Let’s take that chunk out to see how it looks.
%%sql
consumption_hour_avg << SELECT
hour(minute) AS hour,
AVG(cm)*60*24*365 AS consumption
FROM consumption_minute
-- Exclude the long stretch without measurements
-- (between 2021-05-25 and 2022-11-30).
WHERE minute <= '2021-05-25' OR minute >= '2022-11-30'
GROUP BY hour(minute)
;Returning data to local variable consumption_hour_avg
That looks more accurate. It still should have some smoothing going on, given that there are still long-ish periods without measurements (a few days).
import plotly.express as px

# Hourly profile again, now with the measurement gap excluded.
fig = px.bar(consumption_hour_avg, x="hour", y="consumption")
fig.show()
%%sql
select * from consumption_hour_avg;| hour | consumption | |
|---|---|---|
| 0 | 0 | 2855.010478 |
| 1 | 1 | 2461.214385 |
| 2 | 2 | 2395.350274 |
| 3 | 3 | 2388.847036 |
| 4 | 4 | 2390.378867 |
| 5 | 5 | 2433.445357 |
| 6 | 6 | 2469.696091 |
| 7 | 7 | 2626.703842 |
| 8 | 8 | 3064.181390 |
| 9 | 9 | 3506.151287 |
| 10 | 10 | 3707.413794 |
| 11 | 11 | 3803.691548 |
| 12 | 12 | 3805.035654 |
| 13 | 13 | 3814.438403 |
| 14 | 14 | 3860.655194 |
| 15 | 15 | 3853.126255 |
| 16 | 16 | 3881.973769 |
| 17 | 17 | 3876.919397 |
| 18 | 18 | 3838.138196 |
| 19 | 19 | 3756.071784 |
| 20 | 20 | 3708.936021 |
| 21 | 21 | 3564.792813 |
| 22 | 22 | 3450.061564 |
| 23 | 23 | 3320.032412 |
(x1+…+x12)/12 400/12
%sql toy << SELECT * FROM consumption_minute WHERE year(minute) >= 2022 AND month(minute) > 11;Returning data to local variable toy
%%sql
toy << SELECT
*,
-- Prefix with 'H' to get a categorical hour label (H0..H23).
'H'||hour(minute) AS hour
FROM consumption_minute
WHERE minute <= '2021-05-25' OR minute >= '2022-11-30'
;Returning data to local variable toy
px.line(toy, x='minute', y='cm')